"""
实现多线程的爬虫
"""
# 导入Request
import requests
# 导入bs4和 lxml
from bs4 import BeautifulSoup
import lxml
# 导入多线程模块
import threading
# 导入队列
from queue import Queue
# 导入os
import os
# 导入下载图片
from urllib import request


class iLyncSpider:
    """Multi-threaded crawler for course listings on ilync.cn.

    Pipeline (connected by queues):
        url_queue -> parse_url -> html_queue -> get_content_list
                  -> content_queue -> save_content_list
    Call :meth:`run` to execute the whole pipeline.
    """

    def __init__(self):
        # URL template for the paged course-overview listing.
        self.base_url = "http://www.ilync.cn/org/6818_d_0_0_-1_-1_0_{}"
        # URL template for a single course's detail page.
        self.base_detail_url = "http://www.ilync.cn/kecheng/detail_{}?f=org_coursecenter"
        # Number of listing pages to crawl.
        self.pages_number = 12

        # Request headers: pretend to be a desktop Chrome browser.
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; " + \
                                      "Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + \
                                      "Chrome/69.0.3497.100 Safari/537.36"}

        # Queue of listing-page URLs to fetch.
        self.url_queue = Queue()
        # Queue of fetched HTML documents awaiting parsing.
        self.html_queue = Queue()
        # Queue of parsed course-info lists awaiting saving.
        self.content_queue = Queue()

    def get_url_list(self):
        """Fill ``url_queue`` with the URLs of all listing pages (1-based)."""
        for page in range(1, self.pages_number + 1):
            self.url_queue.put(self.base_url.format(page))

    def parse_url(self):
        """Worker: fetch each URL from ``url_queue``, push its HTML to ``html_queue``."""
        while True:
            url = self.url_queue.get()
            # Timeout so a stalled server cannot hang this worker forever.
            response = requests.get(url, headers=self.headers, timeout=30)
            self.html_queue.put(response.content.decode("utf-8"))
            # Mark this URL as fully processed so url_queue.join() can return.
            self.url_queue.task_done()

    def get_content_list(self):
        """Worker: parse HTML from ``html_queue`` into a list of course dicts.

        Each dict holds: id, url, img, title, type, price, times.
        The resulting list is pushed onto ``content_queue``.
        """
        while True:
            html_str = self.html_queue.get()
            soup = BeautifulSoup(html_str, 'lxml')
            # Outer container of the course list, then one cell per course.
            first_filter = soup.find_all('div', class_='course-list-wrap')[0]
            second_filter = first_filter.find_all('div', class_="grid-cell")
            # Accumulates one dict per course: [{...}, {...}, ...]
            courses_infos = []
            for one in second_filter:
                temp_dict = {}
                # Course id is embedded in the href: ".../detail_<id>?..."
                id_str = one.find('a', class_="course-pic").attrs['href']
                temp_dict['id'] = id_str[id_str.find('_') + 1: id_str.find('?')]
                # Detail-page URL built from the course id.
                temp_dict['url'] = self.base_detail_url.format(temp_dict['id'])
                # Image URL only; the actual download happens once, in
                # save_content_list (previously it was downloaded twice).
                temp_dict['img'] = one.find('img').attrs['src']
                # Course title is stored in the image's title attribute.
                temp_dict['title'] = one.find('img').attrs['title']
                # Course category.  ("category" avoids shadowing builtin `type`.)
                category = one.find('div', class_="course-courseware-cate").text
                temp_dict['type'] = category
                # Price text: split the cell and keep the entry that is either
                # "free" (免费) or contains a decimal point.
                price_list = one.find('div', class_="course-price").text.replace("\t", "").split('\n')
                for one_price in price_list:
                    if "免费" in one_price or "." in one_price:
                        temp_dict['price'] = one_price
                # Number of class hours.  (Renamed from `time` to avoid
                # shadowing the stdlib module name.)
                class_hours = one.find('div', class_='course-courseware-num').find('span').text
                temp_dict['times'] = class_hours
                courses_infos.append(temp_dict)
            self.content_queue.put(courses_infos)
            self.html_queue.task_done()

    def save_content_list(self):
        """Worker: consume parsed course lists, download images and print records."""
        while True:
            content_list = self.content_queue.get()
            for item in content_list:
                self.download_image(item['img'])
                print(item)
            self.content_queue.task_done()

    def download_image(self, url: str):
        """Download the image at *url* into the local ``images/`` directory."""
        filename = url.split("/").pop()
        # Create the target directory on first use instead of crashing
        # with FileNotFoundError when it does not exist yet.
        os.makedirs('images', exist_ok=True)
        path = os.path.join('images', filename)
        request.urlretrieve(url, path)

    def run(self):
        """Orchestrate the crawl with daemon worker threads."""
        # Fill the URL queue BEFORE starting any workers.  Doing this in a
        # separate thread (as before) races the final url_queue.join(): a
        # join() on a still-empty queue returns immediately and the program
        # could exit before a single page was fetched.
        self.get_url_list()

        thread_list = []
        # ---------- fetch page HTML ----------
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.parse_url))
        # ---------- extract course data ----------
        for _ in range(4):
            thread_list.append(threading.Thread(target=self.get_content_list))
        # ---------- save / output ----------
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.save_content_list))

        # Start all workers as daemons so they die with the main thread
        # (their while-True loops never return on their own).
        for t in thread_list:
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()

        # Block until every queued item has been fully processed.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()

        print("所有数据获取完成！")


